Loading Packages and Libraries
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rvest)
library(corrplot)
## corrplot 0.90 loaded
library(ggplot2)
library(cluster)
library(fpc)
library(pvclust)
library(mclust)
## Package 'mclust' version 5.4.7
## Type 'citation("mclust")' for citing this R package in publications.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.5 ✓ purrr 0.3.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
## x purrr::map() masks mclust::map()
## x tidyr::pack() masks Matrix::pack()
## x tidyr::unpack() masks Matrix::unpack()
library(bestglm)
## Loading required package: leaps
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
library(Rcpp)
library(reactable)
Importing Data
##### Basketball Reference league page: team stat tables
# Both tables used below come from the same league-summary page, so the page
# is downloaded and parsed once and its <table> nodes are indexed twice
# (previously the identical URL was requested and parsed two times).
per_game_url <- "https://www.basketball-reference.com/leagues/NBA_2021.html"
per_game_page <- rvest::read_html(per_game_url)
league_tables <- rvest::html_nodes(per_game_page, "table")

##### team box-score stats (FGA, 3PA, RB, AST, STL, BLK, TOV)
# `[7]` (not `[[7]]`) keeps a one-element node set, so html_table() returns a
# one-element list; the cleaning step reads it as per_game_df[[1]].
# NOTE(review): the table positions 7 and 11 are hard-coded and depend on the
# page layout at scrape time -- confirm if the site changes.
per_game_df <- rvest::html_table(league_tables[7], fill = TRUE)

#### advanced team stats
# Also a one-element list; the cleaning step reads it as adv_game_df[[1]].
adv_game_df <- rvest::html_table(league_tables[11], fill = TRUE)
#### advanced scoring stats
# Read the pbpstats scoring export, order its rows by the team abbreviation in
# `Name`, and preview the first rows.
# NOTE(review): the path is an absolute path on one machine.
scoring_adv <- read.csv(
  file = "/Users/cartererickson/Desktop/School/STAT/STAT 495R/pbpstats_export.csv",
  sep = ","
)
scoring_adv <- scoring_adv[with(scoring_adv, order(Name)), ]
head(scoring_adv)
## Name GamesPlayed OffPoss Points FG2M FG2A Fg2Pct FG3M
## 16 ATL 72 7086 113.6944 28.36111 53.87500 0.5264243 12.43056
## 3 BKN 72 7187 118.5694 28.97222 51.23611 0.5654649 14.16667
## 20 BOS 72 7094 112.6250 27.86111 52.54167 0.5302670 13.59722
## 23 CHA 72 7083 109.4583 26.25000 50.80556 0.5166758 13.68056
## 17 CHI 72 7136 110.6806 29.59722 54.63889 0.5416878 12.55556
## 7 CLE 72 7032 103.8333 28.58333 56.02778 0.5101636 10.00000
## FG3A Fg3Pct NonHeaveFg3Pct FtPoints PtsAssisted2s PtsUnassisted2s
## 16 33.36111 0.3726062 0.3744770 19.68056 28.58333 28.13889
## 3 36.11111 0.3923077 0.3941267 18.12500 31.05556 26.88889
## 20 36.36111 0.3739496 0.3766833 16.11111 26.63889 29.08333
## 23 37.02778 0.3694674 0.3713208 15.91667 30.94444 21.55556
## 17 33.97222 0.3695830 0.3714521 13.81944 32.47222 26.72222
## 7 29.73611 0.3362915 0.3372365 16.66667 30.86111 26.30556
## PtsAssisted3s PtsUnassisted3s Assisted2sPct NonPutbacksAssisted2sPct
## 16 29.50000 7.791667 0.5039177 0.5635268
## 3 33.79167 8.708333 0.5359540 0.5783756
## 20 30.41667 10.375000 0.4780658 0.5272128
## 23 34.12500 6.916667 0.5894180 0.6300905
## 17 31.58333 6.083333 0.5485687 0.5952138
## 7 25.20833 4.791667 0.5398445 0.5884534
## Assisted3sPct FG3APct ShotQualityAvg EfgPct TsPct PtsPutbacks
## 16 0.7910615 0.3824232 0.5304958 0.5388473 0.5784286 6.000000
## 3 0.7950980 0.4134203 0.5354285 0.5749722 0.6088387 4.250000
## 20 0.7456588 0.4089986 0.5129291 0.5428058 0.5739742 5.194444
## 23 0.8314721 0.4215686 0.5330921 0.5324953 0.5632439 3.388889
## 17 0.8384956 0.3833856 0.5294977 0.5465517 0.5754854 4.638889
## 7 0.8402778 0.3467206 0.5345787 0.5081781 0.5416303 4.722222
## Fg2aBlocked FG2APctBlocked Fg3aBlocked FG3APctBlocked
## 16 4.861111 0.09022944 0.1805556 0.005412157
## 3 4.347222 0.08484684 0.2361111 0.006538462
## 20 4.347222 0.08273857 0.2916667 0.008021390
## 23 4.652778 0.09158010 0.1666667 0.004501125
## 17 4.875000 0.08922217 0.1805556 0.005314800
## 7 5.597222 0.09990084 0.3055556 0.010275572
#### advanced assist stats
# Read the pbpstats assists export, order its rows by the team abbreviation in
# `Name`, and preview the first rows.
# NOTE(review): the path is an absolute path on one machine.
assist_adv <- read.csv(
  file = "/Users/cartererickson/Desktop/School/STAT/STAT 495R/pbpstats_export_assists.csv",
  sep = ","
)
assist_adv <- assist_adv[with(assist_adv, order(Name)), ]
head(assist_adv)
## Name GamesPlayed Assists AssistPoints TwoPtAssists ThreePtAssists
## 14 ATL 385 24.27013 57.94805 14.86234 9.407792
## 21 BKN 390 23.95128 58.12821 13.72564 10.225641
## 12 BOS 390 24.12821 58.09744 14.28718 9.841026
## 29 CHA 383 23.62402 56.34987 14.52219 9.101828
## 3 CHI 383 23.51697 55.75196 14.79896 8.718016
## 1 CLE 383 22.68407 54.71279 13.33943 9.344648
## AtRimAssists ShortMidRangeAssists LongMidRangeAssists Corner3Assists
## 14 10.228571 2.519481 2.114286 2.979221
## 21 10.138462 2.541026 1.046154 2.807692
## 12 8.897436 2.874359 2.515385 2.612821
## 29 9.490862 2.971279 2.060052 2.331593
## 3 9.104439 3.248042 2.446475 2.618799
## 1 9.161880 2.571802 1.605744 2.892950
## Arc3Assists
## 14 6.428571
## 21 7.417949
## 12 7.228205
## 29 6.770235
## 3 6.099217
## 1 6.451697
#### shot distribution
# Read the pbpstats shot-distribution export, order its rows by the team
# abbreviation in `Name`, and preview the first rows.
# NOTE(review): the path is an absolute path on one machine.
shot_dist <- read.csv(
  file = "/Users/cartererickson/Desktop/School/STAT/STAT 495R/pbpstats_export_shotdist.csv",
  sep = ","
)
shot_dist <- shot_dist[with(shot_dist, order(Name)), ]
head(shot_dist)
## Name GamesPlayed ShotQualityAvg AtRimFG3AFrequency Avg2ptShotDistance
## 14 ATL 385 0.5279957 0.7012365 6.697795
## 21 BKN 390 0.5239446 0.7357940 5.891474
## 12 BOS 390 0.5136905 0.6827757 7.202982
## 29 CHA 383 0.5156494 0.6877884 6.925698
## 3 CHI 383 0.5150962 0.6565109 7.195617
## 1 CLE 383 0.5244186 0.6834303 6.812521
## Avg3ptShotDistance AtRimFGM AtRimFGA AtRimFrequency AtRimAccuracy
## 14 25.59966 17.89091 29.02078 0.3305035 0.6164862
## 21 25.44432 18.02821 29.15897 0.3320680 0.6182730
## 12 25.67451 16.40769 26.15385 0.2980278 0.6273529
## 29 25.74308 16.77023 27.92950 0.3204806 0.6004487
## 3 25.43475 17.31070 28.56919 0.3239770 0.6059221
## 1 25.53150 17.31593 27.49347 0.3192070 0.6298196
## UnblockedAtRimAccuracy AtRimPctAssisted AtRimPctBlocked ShortMidRangeFGM
## 14 0.6908033 0.5717189 0.1075808 6.085714
## 21 0.6962765 0.5623667 0.1120295 6.497436
## 12 0.7013371 0.5422722 0.1054902 6.646154
## 29 0.6830072 0.5659349 0.1208750 6.313316
## 3 0.6833643 0.5259427 0.1133248 7.096606
## 1 0.7114353 0.5291013 0.1147198 6.629243
## ShortMidRangeFGA ShortMidRangeFrequency ShortMidRangeAccuracy
## 14 15.73766 0.1792285 0.3866975
## 21 16.25641 0.1851311 0.3996845
## 12 16.37179 0.1865595 0.4059514
## 29 16.04700 0.1841333 0.3934266
## 3 17.44386 0.1978149 0.4068253
## 1 16.59008 0.1926155 0.3995908
## UnblockedShortMidRangeAccuracy ShortMidRangePctAssisted
## 14 0.4378621 0.4139991
## 21 0.4431619 0.3910813
## 12 0.4491423 0.4324846
## 29 0.4321716 0.4706369
## 3 0.4489594 0.4576895
## 1 0.4406456 0.3879480
## ShortMidRangePctBlocked LongMidRangeFGM LongMidRangeFGA
## 14 0.11685097 4.314286 10.49610
## 21 0.09810726 2.723077 6.94359
## 12 0.09616288 4.833333 11.46667
## 29 0.08965181 4.310705 11.16188
## 3 0.09384823 4.945170 12.84595
## 1 0.09316966 4.331593 10.67624
## LongMidRangeFrequency LongMidRangeAccuracy UnblockedLongMidRangeAccuracy
## 14 0.11953499 0.4110369 0.4222166
## 21 0.07907493 0.3921713 0.4003015
## 12 0.13066472 0.4215116 0.4269536
## 29 0.12807837 0.3861988 0.3930017
## 3 0.14567419 0.3849593 0.3909992
## 1 0.12395417 0.4057227 0.4134064
## LongMidRangePctAssisted LongMidRangePctBlocked Corner3FGM Corner3FGA
## 14 0.4900662 0.02647859 3.077922 7.909091
## 21 0.3841808 0.02031019 2.933333 7.723077
## 12 0.5204244 0.01274597 2.779487 7.158974
## 29 0.4778922 0.01730994 2.428198 6.167102
## 3 0.4947202 0.01544715 2.759791 6.924282
## 1 0.3707052 0.01858645 3.088773 7.806789
## Corner3Frequency Corner3Accuracy UnblockedCorner3Accuracy Corner3PctAssisted
## 14 0.09007277 0.3891626 0.3931652 0.9679325
## 21 0.08795188 0.3798141 0.3833780 0.9571678
## 12 0.08157779 0.3882521 0.3904899 0.9400369
## 29 0.07076517 0.3937341 0.3979461 0.9602151
## 3 0.07852194 0.3985671 0.4022070 0.9489120
## 1 0.09063902 0.3956522 0.4004739 0.9366019
## Corner3PctBlocked Arc3FGM Arc3FGA Arc3Frequency Arc3Accuracy
## 14 0.010180624 8.384416 24.64416 0.2806602 0.3402192
## 21 0.009296149 9.697436 27.72821 0.3157741 0.3497318
## 12 0.005730659 9.630769 26.60513 0.3031702 0.3619892
## 29 0.010584251 9.046997 25.84334 0.2965426 0.3500707
## 3 0.009049774 7.608355 22.39948 0.2540120 0.3396666
## 1 0.012040134 8.248042 23.56397 0.2735843 0.3500277
## UnblockedArc3Accuracy Arc3PctAssisted Arc3PctBlocked NonHeaveArc3FGM
## 14 0.3426752 0.7667286 0.007166948 3227
## 21 0.3519777 0.7649392 0.006380618 3780
## 12 0.3645540 0.7505325 0.007035466 3755
## 29 0.3528513 0.7483405 0.007880380 3460
## 3 0.3418583 0.8016472 0.006411004 2913
## 1 0.3528820 0.7822096 0.008088643 3159
## NonHeaveArc3FGA NonHeaveArc3Accuracy HeaveAttempts HeaveMakes
## 14 9398 0.3433709 89 1
## 21 10729 0.3523162 83 2
## 12 10281 0.3652368 94 1
## 29 9783 0.3536747 110 5
## 3 8504 0.3425447 74 1
## 1 8970 0.3521739 55 0
#### misc/pace
# Read the pbpstats pace/misc export and preview the first rows. Unlike the
# scoring, assist and shot-distribution tables, this one is left in file order
# (it is not sorted by `Name`).
# NOTE(review): the path is an absolute path on one machine.
pace_adv <- read.csv(
  file = "/Users/cartererickson/Desktop/School/STAT/STAT 495R/pbpstats_export_pace.csv",
  sep = ","
)
head(pace_adv)
## Name GamesPlayed Pace SecondsPerPossOff SecondsPerPossDef
## 1 CLE 383 96.89494 15.27293 14.45038
## 2 NOP 390 100.52310 13.92902 14.72068
## 3 CHI 383 97.69253 14.70056 14.77969
## 4 DAL 393 96.19231 15.39162 14.54789
## 5 DEN 391 97.03901 14.94897 14.72979
## 6 HOU 390 99.38628 14.22337 14.75512
## SecondsExcludingORebsPerPossOff SecondsExcludingORebsPerPossDef
## 1 14.65544 13.88705
## 2 13.38284 14.13697
## 3 14.11909 14.23100
## 4 14.80348 13.97215
## 5 14.29970 14.16365
## 6 13.59348 14.15882
## FirstChancePoints Blocks Blocked2s Blocked3s BlockedAtRim
## 1 94.85117 3.577023 3.472585 0.10443864 2.339426
## 2 98.99744 5.261538 4.989744 0.27179487 3.117949
## 3 92.71279 4.182768 4.031332 0.15143603 2.751958
## 4 95.44529 4.178117 4.083969 0.09414758 2.511450
## 5 97.06394 4.475703 4.235294 0.24040921 2.659847
## 6 100.96410 4.823077 4.597436 0.22564103 2.751282
## BlockedShortMidRange BlockedLongMidRange BlockedCorner3 BlockedArc3
## 1 1.013055 0.1201044 0.02610966 0.07832898
## 2 1.643590 0.2282051 0.06153846 0.21025641
## 3 1.154047 0.1253264 0.04699739 0.10443864
## 4 1.437659 0.1348601 0.01526718 0.07888041
## 5 1.429668 0.1457801 0.09462916 0.14578005
## 6 1.669231 0.1769231 0.05641026 0.16923077
## RecoveredBlocks BlocksRecoveredPct Steals LostBallSteals BadPassSteals
## 1 1.986945 0.5554745 6.950392 821 1841
## 2 3.046154 0.5789474 7.661538 1028 1960
## 3 2.456919 0.5873908 7.848564 1045 1961
## 4 2.412214 0.5773447 6.704835 1013 1622
## 5 2.572890 0.5748571 7.664962 1077 1920
## 6 2.728205 0.5656566 8.310256 1151 2090
## DefensiveGoaltends
## 1 0.1749347
## 2 0.1666667
## 3 0.1383812
## 4 0.1781170
## 5 0.2710997
## 6 0.2000000
#### rebounds
# Read the pbpstats rebounding export and preview the first rows. The table is
# left in file order (it is not sorted by `Name`).
# NOTE(review): the path is an absolute path on one machine.
rebound_adv <- read.csv(
  file = "/Users/cartererickson/Desktop/School/STAT/STAT 495R/pbpstats_export_rebounds.csv",
  sep = ","
)
head(rebound_adv)
## Name GamesPlayed Rebounds DefRebounds FTDefRebounds DefFTReboundPct
## 1 CLE 383 47.52742 35.15405 2.112272 0.8949115
## 2 NOP 390 50.20000 37.76410 2.182051 0.8846154
## 3 CHI 383 48.53003 36.03133 2.240209 0.8800000
## 4 DAL 393 47.18066 35.97964 2.267176 0.8954774
## 5 DEN 391 49.75703 36.09207 2.375959 0.9198020
## 6 HOU 390 47.81795 35.78205 2.287179 0.8920000
## DefTwoPtRebounds DefTwoPtReboundPct DefThreePtRebounds DefThreePtReboundPct
## 1 18.05483 0.6987672 14.98695 0.7881368
## 2 19.34872 0.7007801 16.23333 0.7973552
## 3 18.38381 0.7134461 15.40731 0.7994852
## 4 18.29008 0.7079681 15.42239 0.7961382
## 5 18.30179 0.7086552 15.41432 0.8005047
## 6 18.16667 0.6869304 15.32821 0.7805197
## DefFGReboundPct OffRebounds FTOffRebounds OffFTReboundPct OffTwoPtRebounds
## 1 0.7366552 12.37337 0.2062663 0.08053007 8.154047
## 2 0.7417682 12.43590 0.3307692 0.11611161 8.341026
## 3 0.7502609 12.49869 0.2114883 0.09654350 8.587467
## 4 0.7457503 11.20102 0.2315522 0.09795479 6.519084
## 5 0.7478868 13.66496 0.2813299 0.11727079 8.928389
## 6 0.7268124 12.03590 0.2051282 0.07259528 6.235897
## OffTwoPtReboundPct OffThreePtRebounds OffThreePtReboundPct OffFGReboundPct
## 1 0.3123000 4.013055 0.2067249 0.2672785
## 2 0.3015947 3.764103 0.1998639 0.2603828
## 3 0.2948982 3.699739 0.2029795 0.2595125
## 4 0.2699968 4.450382 0.2016371 0.2373507
## 5 0.3366442 4.455243 0.2344865 0.2940053
## 6 0.3144963 5.594872 0.2079878 0.2531826
## DefAtRimReboundPct DefShortMidRangeReboundPct DefLongMidRangeReboundPct
## 1 0.6270492 0.7026247 0.8043564
## 2 0.6297801 0.7035771 0.8037383
## 3 0.6344351 0.7322075 0.8167267
## 4 0.6235775 0.7035061 0.8179211
## 5 0.6353591 0.7174030 0.8038685
## 6 0.6052083 0.6965844 0.7986196
## DefArc3ReboundPct DefCorner3ReboundPct OffAtRimReboundPct
## 1 0.7902387 0.7812865 0.3860104
## 2 0.8013820 0.7827606 0.3835240
## 3 0.8027317 0.7877270 0.3765778
## 4 0.8021201 0.7748503 0.3503023
## 5 0.8104796 0.7709321 0.4055666
## 6 0.7828512 0.7735554 0.3479466
## OffShortMidRangeReboundPct OffLongMidRangeReboundPct OffArc3ReboundPct
## 1 0.3124502 0.1921618 0.1967850
## 2 0.2959875 0.1710628 0.1878453
## 3 0.2817369 0.1942568 0.1965970
## 4 0.2746459 0.1719490 0.1923796
## 5 0.3235656 0.2385120 0.2239006
## 6 0.3223235 0.1787611 0.2045991
## OffCorner3ReboundPct SelfOReb SelfORebPct
## 1 0.2384442 593 0.03824573
## 2 0.2387543 727 0.04471095
## 3 0.2249047 628 0.03890713
## 4 0.2390925 555 0.03342769
## 5 0.2773623 871 0.05451245
## 6 0.2191333 677 0.04130819
Cleaning Data
##### team box-score stats (FGA, 3PA, RB, AST, STL, BLK, TOV)
# Values in this table are season totals (e.g. FGA is in the thousands in the
# summary further down), despite the "per game" object names.
# Steps, in order:
#   1. drop the columns that are not analysed;
#   2. flag teams whose name carries a "*" (PlayoffFlag = 1) and then strip
#      the first "*" from the name;
#   3. drop row 31 -- presumably the league-average summary row; confirm;
#   4. rename Team to Franchise, the key used for the later join.
bref_per_game <- per_game_df[[1]] %>%
  dplyr::select(-c(Rk, G, MP, FG, 'FG%', '3P', '3P%',
                   '2P', '2P%', FT, 'FT%', PF, PTS)) %>%
  dplyr::mutate(
    PlayoffFlag = ifelse(grepl("\\*", Team), 1, 0),
    Team = sub("\\*", "", Team)
  )
bref_per_game <- bref_per_game[-31, ] %>%
  dplyr::rename(Franchise = Team)
#### advanced team stats
# Replace the scraped header with explicit column names. The three
# placeholder names ("NA", "NA", "NA.1") label columns that are dropped right
# below -- presumably empty spacer columns in the source table; confirm.
names(adv_game_df[[1]]) <- c(
  "Rk", "Team", "Age", "W", "L", "PW", "PL",
  "MOV", "SOS", "SRS", "ORtg", "DRtg", "NRtg",
  "Pace", "FTr", "3PAr", "TS%", "NA", "OeFG%",
  "OTOV%", "ORB%", "FT/FGA", "NA", "DeFG%",
  "OppTOV%", "DRB%", "OppFT/FGA", "NA.1", "Arena",
  "Attend.", "Attend./G"
)

# Keep only the columns that are analysed.
bref_adv_game <- adv_game_df[[1]] %>%
  dplyr::select(-c(Rk, W, L, PW, PL, MOV, SOS, SRS,
                   FTr, `TS%`, `NA`, NA.1, Arena,
                   Attend., `Attend./G`))

# Drop original rows 1 and 32. This is the same as removing row 1 and then row
# 31 of what remains. Presumably these are a repeated header row and the
# league-average row; confirm against the scraped table.
bref_adv_game <- bref_adv_game[-c(1, 32), ]

# Strip the first "*" from team names and rename the key column so it matches
# the Franchise column of the box-score table.
bref_adv_game$Team <- sub("\\*", "", bref_adv_game$Team)
bref_adv_game <- dplyr::rename(bref_adv_game, Franchise = Team)
# Trim each pbpstats table to the columns kept for modelling. Every call
# removes GamesPlayed plus the columns listed; all other columns pass through.

# advanced scoring data
scoring_adv <- dplyr::select(
  scoring_adv,
  -c(GamesPlayed, FG2M, Fg2Pct, FG3M, Fg3Pct,
     NonHeaveFg3Pct, Assisted2sPct,
     NonPutbacksAssisted2sPct, Assisted3sPct,
     EfgPct, TsPct, Fg2aBlocked, FG2APctBlocked,
     Fg3aBlocked, FG3APctBlocked)
)

# advanced assist data
assist_adv <- dplyr::select(assist_adv, -c(GamesPlayed))

# shot distribution
shot_dist <- dplyr::select(
  shot_dist,
  -c(GamesPlayed, AtRimFGM, AtRimAccuracy,
     UnblockedAtRimAccuracy, AtRimPctAssisted,
     AtRimPctBlocked, ShortMidRangeFGM,
     ShortMidRangeAccuracy,
     UnblockedShortMidRangeAccuracy,
     ShortMidRangePctAssisted,
     ShortMidRangePctBlocked, LongMidRangeFGM,
     LongMidRangeAccuracy,
     UnblockedLongMidRangeAccuracy,
     LongMidRangePctAssisted,
     LongMidRangePctBlocked, Corner3FGM,
     Corner3Accuracy, UnblockedCorner3Accuracy,
     Corner3PctAssisted, Corner3PctBlocked, Arc3FGM,
     Arc3Accuracy, UnblockedArc3Accuracy,
     Arc3PctAssisted, Arc3PctBlocked, NonHeaveArc3FGM,
     NonHeaveArc3Accuracy, HeaveAttempts, HeaveMakes)
)

# misc/pace
pace_adv <- dplyr::select(
  pace_adv,
  -c(GamesPlayed, Blocked2s, Blocked3s, BlockedAtRim,
     BlockedShortMidRange, BlockedLongMidRange, BlockedCorner3,
     BlockedArc3, BlocksRecoveredPct, LostBallSteals,
     BadPassSteals, DefensiveGoaltends, FirstChancePoints, Pace)
)

# rebounds
rebound_adv <- dplyr::select(
  rebound_adv,
  -c(GamesPlayed, FTDefRebounds, DefFTReboundPct,
     DefTwoPtRebounds, DefTwoPtReboundPct,
     DefThreePtRebounds, DefThreePtReboundPct,
     DefFGReboundPct, FTOffRebounds,
     OffFTReboundPct, OffTwoPtRebounds,
     OffTwoPtReboundPct, OffThreePtRebounds,
     OffThreePtReboundPct, OffFGReboundPct,
     DefAtRimReboundPct, DefShortMidRangeReboundPct,
     DefLongMidRangeReboundPct,
     DefArc3ReboundPct, DefCorner3ReboundPct,
     OffAtRimReboundPct, OffShortMidRangeReboundPct,
     OffLongMidRangeReboundPct, OffArc3ReboundPct,
     OffCorner3ReboundPct, SelfORebPct)
)
Combining Data
# Basketball Reference: join the box-score and advanced tables on Franchise,
# then drop PlayoffFlag so it is not carried into the analysis.
bref_all <- dplyr::select(
  dplyr::inner_join(bref_per_game, bref_adv_game, by = "Franchise"),
  -c(PlayoffFlag)
)

# Play-by-play offense: scoring + assists + shot distribution, keyed on Name.
# scoring_adv and shot_dist both carry a ShotQualityAvg column, so the joined
# table ends up with ShotQualityAvg.x (scoring) and ShotQualityAvg.y (shots).
pbp_offense <- dplyr::inner_join(
  dplyr::inner_join(scoring_adv, assist_adv, by = "Name"),
  shot_dist,
  by = "Name"
)

# Play-by-play defense: pace/misc + rebounding, keyed on Name.
pbp_defense <- dplyr::inner_join(pace_adv, rebound_adv, by = "Name")
# Move the team identifier out of the data and into the row names so that only
# measurement columns remain. In each table the identifier is the first
# column, which is removed by position.

# Basketball Reference: convert to a plain data frame first, then use
# Franchise as the row names.
bref_all <- as.data.frame(bref_all)
rownames(bref_all) <- bref_all[["Franchise"]]
bref_all <- bref_all[, -1]

# Play-by-play tables: use the team abbreviation in Name as the row names.
rownames(pbp_offense) <- pbp_offense[["Name"]]
pbp_offense <- pbp_offense[, -1]
rownames(pbp_defense) <- pbp_defense[["Name"]]
pbp_defense <- pbp_defense[, -1]
# changing to numeric to allow for analysis to be performed
# Every column of bref_all is converted, instead of a hard-coded 1:25 range,
# so the step keeps working if columns are added or removed upstream.
# `bref_all[] <-` assigns column by column, which keeps the data frame, its
# column names and its row names. lapply() always returns one vector per
# column, whereas sapply()'s return shape depends on its input.
bref_all[] <- lapply(bref_all, as.numeric)
EDA
# Print summary() of every column in bref_all as a first look at the data.
summary(bref_all)
## FGA 3PA 2PA FTA ORB
## Min. :6029 Min. :2046 Min. :3246 Min. :1258 Min. :574.0
## 1st Qu.:6282 1st Qu.:2266 1st Qu.:3666 1st Qu.:1510 1st Qu.:671.8
## Median :6364 Median :2476 Median :3873 Median :1541 Median :696.5
## Mean :6366 Mean :2494 Mean :3872 Mean :1571 Mean :707.7
## 3rd Qu.:6423 3rd Qu.:2668 3rd Qu.:4055 3rd Qu.:1621 3rd Qu.:757.8
## Max. :6610 Max. :3098 Max. :4472 Max. :1884 Max. :845.0
## DRB TRB AST STL BLK
## Min. :2307 Min. :2981 Min. :1531 Min. :450.0 Min. :286.0
## 1st Qu.:2412 1st Qu.:3094 1st Qu.:1704 1st Qu.:504.2 1st Qu.:316.5
## Median :2490 Median :3186 Median :1767 Median :544.0 Median :351.0
## Mean :2482 Mean :3190 Mean :1786 Mean :545.2 Mean :350.8
## 3rd Qu.:2544 3rd Qu.:3253 3rd Qu.:1919 3rd Qu.:578.8 3rd Qu.:371.0
## Max. :2724 Max. :3474 Max. :1991 Max. :655.0 Max. :460.0
## TOV Age ORtg DRtg
## Min. : 799.0 Min. :22.80 Min. :103.5 Min. :107.1
## 1st Qu.: 952.2 1st Qu.:25.18 1st Qu.:110.7 1st Qu.:111.2
## Median :1003.5 Median :26.25 Median :112.0 Median :112.5
## Mean : 996.2 Mean :26.09 Mean :112.4 Mean :112.3
## 3rd Qu.:1058.0 3rd Qu.:27.00 3rd Qu.:115.6 3rd Qu.:113.8
## Max. :1162.0 Max. :28.80 Max. :118.3 Max. :117.2
## NRtg Pace 3PAr OeFG%
## Min. :-10.50000 Min. : 95.90 Min. :0.3140 Min. :0.4900
## 1st Qu.: -1.87500 1st Qu.: 97.67 1st Qu.:0.3578 1st Qu.:0.5250
## Median : 0.50000 Median : 98.80 Median :0.3885 Median :0.5395
## Mean : 0.02333 Mean : 99.18 Mean :0.3919 Mean :0.5379
## 3rd Qu.: 2.70000 3rd Qu.:100.33 3rd Qu.:0.4198 3rd Qu.:0.5497
## Max. : 9.30000 Max. :104.10 Max. :0.4880 Max. :0.5750
## OTOV% ORB% FT/FGA DeFG%
## Min. : 9.90 Min. :17.90 Min. :0.1560 Min. :0.5070
## 1st Qu.:11.90 1st Qu.:21.12 1st Qu.:0.1810 1st Qu.:0.5310
## Median :12.25 Median :22.10 Median :0.1895 Median :0.5390
## Mean :12.36 Mean :22.19 Mean :0.1920 Mean :0.5380
## 3rd Qu.:13.07 3rd Qu.:23.45 3rd Qu.:0.1983 3rd Qu.:0.5467
## Max. :14.20 Max. :26.30 Max. :0.2260 Max. :0.5570
## OppTOV% DRB% OppFT/FGA
## Min. :10.30 Min. :74.90 Min. :0.1570
## 1st Qu.:11.53 1st Qu.:76.95 1st Qu.:0.1832
## Median :12.45 Median :77.85 Median :0.1955
## Mean :12.36 Mean :77.80 Mean :0.1921
## 3rd Qu.:13.07 3rd Qu.:78.58 3rd Qu.:0.2008
## Max. :14.40 Max. :80.30 Max. :0.2340
# Correlation matrix of the Basketball Reference variables, drawn with
# corrplot to see how the variables (Pace among them) relate to each other.
team_cor <- cor(bref_all)
corrplot(team_cor)

# Correlation matrices of the play-by-play offense and defense data.
pbp_offense_cor <- cor(pbp_offense)
corrplot(pbp_offense_cor)

pbp_defense_cor <- cor(pbp_defense)
corrplot(pbp_defense_cor)

# Relationship between seconds per defensive possession and defensive rebounds.
ggplot(
  data = pbp_defense,
  mapping = aes(x = SecondsPerPossDef, y = DefRebounds)
) +
  geom_point()

# Scatterplot matrices of bref_all, drawn in two calls: columns 1-13, then
# columns 13-25.
pairs(bref_all, horInd = 1:13, verInd = 1:13)

pairs(bref_all, horInd = 13:25, verInd = 13:25)

LASSO 1
# LASSO on the Basketball Reference data with NRtg as the response.
# model.matrix() adds an intercept column; it is removed by position because
# glmnet fits its own intercept (see the "(Intercept)" row in the printed
# coefficients).
# NOTE(review): ORtg and DRtg are among the predictors, and the fit recovers
# coefficients of about +1 and -1 for them with everything else near zero,
# i.e. NRtg ~ ORtg - DRtg. Confirm that this is the intended model.
brefX <- model.matrix(NRtg ~ ., data = bref_all)[, -1]

# Penalty grid: 1000 values, evenly spaced on the log scale from exp(-15) to
# exp(15).
lambda <- exp(seq(-15, 15, length.out = 1000))

# alpha = 1 selects the LASSO penalty; plot the coefficient paths.
lasso_lm <- glmnet(brefX, bref_all$NRtg, alpha = 1, lambda = lambda)
plot(lasso_lm)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

# Cross-validate over the same penalty grid and keep the lambda with the
# lowest cross-validated error.
# NOTE(review): the folds are assigned at random and no set.seed() call is
# visible in this section, so lambda.min can change from run to run.
lasso_cv <- cv.glmnet(brefX, bref_all$NRtg, alpha = 1, lambda = lambda)
lbestlam <- lasso_cv[["lambda.min"]]
plot(lasso_cv)

# LASSO coefficients at the selected lambda (a one-column sparse matrix).
lcoefs <- predict(lasso_lm, s = lbestlam, type = "coefficient")

# Names of the non-zero coefficients. The first name is dropped on the
# assumption that it is the intercept, which holds only while the intercept
# itself is non-zero.
lvars <- names(lcoefs[lcoefs[, 1] != 0, ])[-1]

# Refit ordinary least squares on the selected predictors. data.frame()
# rewrites non-syntactic column names, which is why `DeFG%` shows up as
# X.DeFG.. in the output.
regular_lm_lasso <- lm(bref_all$NRtg ~ . , data = data.frame(brefX[, lvars]))
regular_sumary_lasso <- summary(regular_lm_lasso)

# Side by side: LASSO estimate (first column) next to the OLS coefficient
# table.
cbind(lcoefs[lcoefs!=0], regular_sumary_lasso$coefficients)
## <sparse>[ <logic> ] : .M.sub.i.logical() maybe inefficient
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.454678e-06 -2.075628e-14 1.049645e-13 -1.977458e-01 0.84478187
## ORtg 9.999999e-01 1.000000e+00 4.635633e-16 2.157203e+15 0.00000000
## DRtg -9.999999e-01 -1.000000e+00 1.299090e-15 -7.697694e+14 0.00000000
## X.DeFG.. -6.108012e-06 -4.573570e-13 2.593316e-13 -1.763600e+00 0.08954933
# Print the full LASSO coefficient vector; "." marks coefficients shrunk to 0.
lcoefs
## 25 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -4.454678e-06
## FGA .
## `3PA` .
## `2PA` .
## FTA .
## ORB .
## DRB .
## TRB .
## AST .
## STL .
## BLK .
## TOV .
## Age .
## ORtg 9.999999e-01
## DRtg -9.999999e-01
## Pace .
## `3PAr` .
## `OeFG%` .
## `OTOV%` .
## `ORB%` .
## `FT/FGA` .
## `DeFG%` -6.108012e-06
## `OppTOV%` .
## `DRB%` .
## `OppFT/FGA` .
# Build the reduced Basketball Reference table: drop NRtg, Age, 2PA and the
# rate/percentage columns listed below. What remains is the counting stats
# plus ORtg, DRtg and Pace. The list is a manual choice; it is not simply the
# set of variables the LASSO zeroed out.
bref_final <- dplyr::select(
  bref_all,
  -c(`2PA`, NRtg, `3PAr`, `OTOV%`,
     `FT/FGA`, `DeFG%`, `OppTOV%`, `ORB%`, `DRB%`,
     `OeFG%`, `OppFT/FGA`, Age)
)
bref_final
## FGA 3PA FTA ORB DRB TRB AST STL BLK TOV ORtg
## Milwaukee Bucks 6610 2669 1539 741 2724 3465 1834 585 334 995 117.2
## Brooklyn Nets 6289 2600 1623 640 2559 3199 1929 484 379 975 118.3
## Washington Wizards 6547 2088 1884 697 2557 3254 1835 528 297 1037 111.2
## Utah Jazz 6344 3098 1546 765 2709 3474 1703 474 371 1023 117.6
## Portland Trail Blazers 6558 2939 1558 766 2441 3207 1531 496 363 799 117.8
## Indiana Pacers 6567 2445 1493 648 2424 3072 1973 611 460 975 112.4
## Phoenix Suns 6357 2490 1347 630 2462 3092 1939 517 312 902 117.2
## Denver Nuggets 6422 2462 1406 758 2442 3200 1933 582 323 972 117.1
## New Orleans Pelicans 6412 2190 1878 845 2568 3413 1872 545 315 1052 113.5
## Los Angeles Clippers 6242 2498 1387 678 2501 3179 1756 509 295 950 117.6
## Sacramento Kings 6382 2400 1585 674 2307 2981 1836 543 358 963 113.6
## Golden State Warriors 6347 2789 1520 574 2524 3098 1991 587 342 1080 111.1
## Atlanta Hawks 6281 2402 1745 760 2525 3285 1737 503 342 953 115.7
## Philadelphia 76ers 6257 2169 1836 722 2522 3244 1706 655 447 1040 113.2
## Memphis Grizzlies 6608 2258 1536 803 2543 3346 1938 655 364 957 112.0
## Boston Celtics 6401 2618 1496 765 2421 3186 1689 556 383 1012 114.0
## Dallas Mavericks 6287 2744 1524 657 2463 3120 1647 450 311 869 115.4
## Minnesota Timberwolves 6546 2706 1662 757 2376 3133 1846 632 398 1027 109.5
## Toronto Raptors 6383 2831 1536 680 2314 2994 1735 618 389 952 112.0
## San Antonio Spurs 6518 2046 1584 669 2489 3158 1759 505 366 821 111.0
## Chicago Bulls 6380 2446 1258 693 2544 3237 1927 482 304 1089 111.1
## Los Angeles Lakers 6197 2248 1679 695 2490 3185 1775 562 386 1095 109.9
## Charlotte Hornets 6324 2666 1505 762 2389 3151 1933 565 344 1069 110.9
## Houston Rockets 6372 2923 1606 671 2396 3067 1699 546 361 1060 107.1
## Miami Heat 6029 2606 1520 579 2409 2988 1895 569 286 1013 111.2
## New York Knicks 6225 2163 1506 696 2554 3250 1541 507 365 932 110.6
## Detroit Pistons 6162 2370 1683 694 2381 3075 1743 531 371 1075 108.0
## Oklahoma City Thunder 6338 2529 1536 715 2568 3283 1588 504 316 1162 103.5
## Orlando Magic 6423 2288 1543 747 2525 3272 1571 496 318 924 105.1
## Cleveland Cavaliers 6175 2141 1614 751 2327 3078 1716 559 325 1114 105.8
## DRtg Pace
## Milwaukee Bucks 111.4 102.2
## Brooklyn Nets 113.8 99.5
## Washington Wizards 113.0 104.1
## Utah Jazz 108.3 98.5
## Portland Trail Blazers 116.0 98.4
## Indiana Pacers 112.4 101.6
## Phoenix Suns 111.3 97.2
## Denver Nuggets 112.1 97.1
## New Orleans Pelicans 113.8 100.1
## Los Angeles Clippers 111.2 96.9
## Sacramento Kings 117.2 100.0
## Golden State Warriors 110.1 102.2
## Atlanta Hawks 113.3 97.6
## Philadelphia 76ers 107.6 99.5
## Memphis Grizzlies 111.0 100.4
## Boston Celtics 112.5 98.3
## Dallas Mavericks 113.0 97.3
## Minnesota Timberwolves 115.0 101.6
## Toronto Raptors 112.5 99.2
## San Antonio Spurs 112.8 98.9
## Chicago Bulls 112.0 99.0
## Los Angeles Lakers 107.1 98.7
## Charlotte Hornets 112.8 98.3
## Houston Rockets 114.9 101.4
## Miami Heat 111.2 96.6
## New York Knicks 108.2 95.9
## Detroit Pistons 112.5 97.9
## Oklahoma City Thunder 114.0 101.0
## Orlando Magic 114.5 98.7
## Cleveland Cavaliers 114.4 97.3
LASSO 2
# LASSO on the play-by-play offense data with OffPoss as the response.
# model.matrix() adds an intercept column; it is removed by position because
# glmnet fits its own intercept.
pbpoX <- model.matrix(OffPoss ~ ., data = pbp_offense)[, -1]

# Penalty grid: 1000 values, evenly spaced on the log scale from exp(-15) to
# exp(15).
lambda <- exp(seq(-15, 15, length.out = 1000))

# alpha = 1 selects the LASSO penalty. On this data glmnet warns that it did
# not converge for the smallest lambdas and returns only the solutions for the
# larger ones.
lasso_lm <- glmnet(pbpoX, pbp_offense$OffPoss, alpha = 1, lambda = lambda)
## Warning: from glmnet Fortran code (error code -730); Convergence for 730th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
# Plot the LASSO coefficient paths for the play-by-play offense model.
plot(lasso_lm)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

# Cross-validate the play-by-play offense LASSO over the same penalty grid.
# NOTE(review): the folds are assigned at random and no set.seed() call is
# visible in this section, so the result can change from run to run.
lasso_cv <- cv.glmnet(
  pbpoX,
  pbp_offense$OffPoss,
  alpha = 1,
  lambda = lambda
)
## Warning: from glmnet Fortran code (error code -730); Convergence for 730th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -740); Convergence for 740th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -661); Convergence for 661th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -636); Convergence for 636th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -671); Convergence for 671th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
# Keep the lambda with the lowest cross-validated error and plot the CV curve.
lbestlam <- lasso_cv[["lambda.min"]]
plot(lasso_cv)

# LASSO coefficients at the selected lambda (a one-column sparse matrix).
lcoefs <- predict(lasso_lm, s = lbestlam, type = "coefficient")

# Names of the non-zero coefficients. The first name is dropped on the
# assumption that it is the intercept, which holds only while the intercept
# itself is non-zero.
lvars <- names(lcoefs[lcoefs[, 1] != 0, ])[-1]

# Refit ordinary least squares on the selected predictors only.
regular_lm_lasso <- lm(pbp_offense$OffPoss ~ . , data = data.frame(pbpoX[, lvars]))
regular_sumary_lasso <- summary(regular_lm_lasso)

# Side by side: LASSO estimate (first column) next to the OLS coefficient
# table.
cbind(lcoefs[lcoefs!=0], regular_sumary_lasso$coefficients)
## <sparse>[ <logic> ] : .M.sub.i.logical() maybe inefficient
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 106.8799456 50.226619 2519.456250 0.0199355 9.843142e-01
## FG2A 44.5868615 59.071783 9.661006 6.1144545 8.922583e-06
## FG3A 46.9471921 75.129683 13.062477 5.7515647 1.881936e-05
## FtPoints 16.6628720 20.818434 11.044274 1.8849979 7.567999e-02
## PtsAssisted2s 0.5541723 -3.551537 6.230448 -0.5700291 5.757045e-01
## PtsAssisted3s -8.6827225 -21.224589 9.358974 -2.2678328 3.588468e-02
## PtsUnassisted3s -5.5118119 -19.171310 10.404971 -1.8425145 8.193452e-02
## ShotQualityAvg.x 310.1949445 345.753350 1546.263612 0.2236057 8.255823e-01
## PtsPutbacks -36.0523029 -62.648877 22.826839 -2.7445271 1.332702e-02
## AtRimAssists 49.7276794 65.193560 20.600352 3.1646819 5.362787e-03
## Avg3ptShotDistance 95.6834436 48.770663 93.862937 0.5195945 6.096771e-01
## Corner3FGA 16.4559906 15.771654 18.935731 0.8329044 4.158138e-01
# Print the full LASSO coefficient vector; "." marks coefficients shrunk to 0.
lcoefs
## 36 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 106.8799456
## Points .
## FG2A 44.5868615
## FG3A 46.9471921
## FtPoints 16.6628720
## PtsAssisted2s 0.5541723
## PtsUnassisted2s .
## PtsAssisted3s -8.6827225
## PtsUnassisted3s -5.5118119
## FG3APct .
## ShotQualityAvg.x 310.1949445
## PtsPutbacks -36.0523029
## Assists .
## AssistPoints .
## TwoPtAssists .
## ThreePtAssists .
## AtRimAssists 49.7276794
## ShortMidRangeAssists .
## LongMidRangeAssists .
## Corner3Assists .
## Arc3Assists .
## ShotQualityAvg.y .
## AtRimFG3AFrequency .
## Avg2ptShotDistance .
## Avg3ptShotDistance 95.6834436
## AtRimFGA .
## AtRimFrequency .
## ShortMidRangeFGA .
## ShortMidRangeFrequency .
## LongMidRangeFGA .
## LongMidRangeFrequency .
## Corner3FGA 16.4559906
## Corner3Frequency .
## Arc3FGA .
## Arc3Frequency .
## NonHeaveArc3FGA .
# Removing variables recommended by LASSO
pbpo_final = pbp_offense %>%
dplyr::select(!c(Points, PtsAssisted2s, FG3APct, Assists,
AssistPoints, TwoPtAssists, LongMidRangeAssists,
Arc3Assists, NonHeaveArc3FGA, Corner3Frequency,
ShortMidRangeFrequency, LongMidRangeFrequency,
AtRimFrequency, Avg2ptShotDistance,
AtRimFG3AFrequency, ShotQualityAvg.y))
rename(pbpo_final, ShotQualityAvg = ShotQualityAvg.x)
## OffPoss FG2A FG3A FtPoints PtsUnassisted2s PtsAssisted3s
## ATL 7086 53.87500 33.36111 19.68056 28.13889 29.50000
## BKN 7187 51.23611 36.11111 18.12500 26.88889 33.79167
## BOS 7094 52.54167 36.36111 16.11111 29.08333 30.41667
## CHA 7083 50.80556 37.02778 15.91667 21.55556 34.12500
## CHI 7136 54.63889 33.97222 13.81944 26.72222 31.58333
## CLE 7032 56.02778 29.73611 16.66667 26.30556 25.20833
## DAL 6996 49.20833 38.11111 16.47222 30.27778 32.20833
## DEN 7044 55.00000 34.19444 15.68056 28.97222 32.87500
## DET 7050 52.66667 32.91667 17.75000 26.61111 31.25000
## GSW 7344 49.41667 38.73611 16.56944 21.63889 35.20833
## HOU 7262 47.90278 40.59722 16.51389 26.16667 33.62500
## IND 7353 57.25000 33.95833 16.43056 28.75000 32.58333
## LAC 6970 52.00000 34.69444 16.16667 30.08333 35.70833
## LAL 7099 54.84722 31.22222 17.23611 29.22222 29.08333
## MEM 7229 60.41667 31.36111 16.44444 29.00000 29.16667
## MIA 6949 47.54167 36.19444 16.68056 23.02778 34.66667
## MIL 7348 54.73611 37.06944 16.23611 32.25000 33.83333
## MIN 7305 53.33333 37.58333 17.56944 25.83333 32.83333
## NOP 7224 58.63889 30.41667 19.01389 30.27778 27.66667
## NYK 6926 56.41667 30.04167 16.40278 32.33333 29.70833
## OKC 7274 52.90278 35.12500 15.45833 29.33333 29.41667
## ORL 7081 57.43056 31.77778 16.61111 29.08333 27.00000
## PHI 7179 56.77778 30.12500 19.56944 33.02778 30.20833
## PHX 7062 53.70833 34.58333 15.61111 28.80556 33.25000
## POR 7078 50.26389 40.81944 17.80556 30.41667 32.70833
## SAC 7189 55.30556 33.33333 16.40278 29.47222 29.33333
## SAS 7174 62.11111 28.41667 17.41667 32.41667 26.16667
## TOR 7105 49.33333 39.31944 17.38889 26.08333 35.66667
## UTA 7124 45.08333 43.02778 17.15278 27.27778 38.29167
## WAS 7506 61.91667 29.00000 20.12500 32.91667 26.87500
## PtsUnassisted3s ShotQualityAvg PtsPutbacks ThreePtAssists AtRimAssists
## ATL 7.791667 0.5304958 6.000000 9.407792 10.228571
## BKN 8.708333 0.5354285 4.250000 10.225641 10.138462
## BOS 10.375000 0.5129291 5.194444 9.841026 8.897436
## CHA 6.916667 0.5330921 3.388889 9.101828 9.490862
## CHI 6.083333 0.5294977 4.638889 8.718016 9.104439
## CLE 4.791667 0.5345787 4.722222 9.344648 9.161880
## DAL 9.208333 0.5239492 4.000000 10.170483 7.969466
## DEN 5.750000 0.5126785 6.305556 9.780051 10.690537
## DET 3.416667 0.5214913 4.805556 9.335938 8.653646
## GSW 8.458333 0.5204043 3.416667 10.049608 11.647520
## HOU 7.708333 0.5324225 4.472222 10.743590 10.046154
## IND 4.458333 0.5377273 4.611111 8.437340 9.723785
## LAC 7.083333 0.5147173 4.583333 9.002564 9.694872
## LAL 4.083333 0.5230556 4.722222 8.473008 11.053985
## MEM 4.291667 0.5248266 6.472222 8.480818 9.263427
## MIA 4.166667 0.5220335 2.944444 10.281330 9.317136
## MIL 9.416667 0.5283572 4.527778 9.831202 10.703325
## MIN 6.500000 0.5407673 5.416667 8.510471 10.060209
## NOP 4.083333 0.5358961 6.388889 9.302564 11.112821
## NYK 5.583333 0.5154981 4.333333 8.132812 8.289062
## OKC 6.291667 0.5264936 5.194444 8.541026 9.446154
## ORL 5.666667 0.5030609 5.333333 8.831202 8.132992
## PHI 3.583333 0.5152936 5.194444 10.028133 9.979540
## PHX 5.916667 0.5136009 4.111111 8.411765 9.994885
## POR 14.458333 0.5247641 5.777778 8.622449 8.178571
## SAC 7.083333 0.5200065 4.611111 8.892308 9.076923
## SAS 3.666667 0.5095367 4.416667 8.449871 7.984576
## TOR 7.708333 0.5201906 4.555556 10.028205 8.884615
## UTA 11.916667 0.5194589 4.861111 9.882051 9.733333
## WAS 3.666667 0.4994307 4.416667 9.069231 9.687179
## ShortMidRangeAssists Corner3Assists Avg3ptShotDistance AtRimFGA
## ATL 2.519481 2.979221 25.59966 29.02078
## BKN 2.541026 2.807692 25.44432 29.15897
## BOS 2.874359 2.612821 25.67451 26.15385
## CHA 2.971279 2.331593 25.74308 27.92950
## CHI 3.248042 2.618799 25.43475 28.56919
## CLE 2.571802 2.892950 25.53150 27.49347
## DAL 2.300254 2.735369 25.61850 23.07888
## DEN 3.641944 2.514066 25.42626 28.69054
## DET 2.895833 2.786458 25.68576 26.77083
## GSW 3.234987 2.579634 26.11247 25.69974
## HOU 1.333333 3.582051 25.64567 28.76667
## IND 3.145780 2.552430 25.90705 28.70844
## LAC 2.438462 2.843590 25.55084 28.10513
## LAL 2.586118 2.478149 25.61972 31.60154
## MEM 3.777494 2.370844 25.54225 27.30435
## MIA 2.682864 3.117647 25.43866 26.61893
## MIL 2.485934 2.846547 25.79341 29.98721
## MIN 3.065445 2.353403 25.40405 28.94241
## NOP 3.210256 2.635897 25.45126 30.96154
## NYK 2.380208 2.325521 25.51956 28.18750
## OKC 2.294872 2.569231 25.41701 30.28462
## ORL 3.358056 1.897698 25.69389 25.03581
## PHI 2.951407 2.329923 25.70648 27.37340
## PHX 3.135550 2.644501 25.47443 27.86701
## POR 2.204082 2.142857 25.71303 28.38010
## SAC 3.423077 2.525641 25.44977 26.40256
## SAS 3.637532 2.416452 25.40403 23.59640
## TOR 2.751282 3.248718 25.54064 27.88974
## UTA 2.023077 3.664103 25.24738 26.49231
## WAS 3.517949 2.264103 25.77688 26.93590
## ShortMidRangeFGA LongMidRangeFGA Corner3FGA Arc3FGA Arc3Frequency
## ATL 15.73766 10.496104 7.909091 24.64416 0.2806602
## BKN 16.25641 6.943590 7.723077 27.72821 0.3157741
## BOS 16.37179 11.466667 7.158974 26.60513 0.3031702
## CHA 16.04700 11.161880 6.167102 25.84334 0.2965426
## CHI 17.44386 12.845953 6.924282 22.39948 0.2540120
## CLE 16.59008 10.676240 7.806789 23.56397 0.2735843
## DAL 15.85496 11.882952 7.267176 28.36132 0.3280841
## DEN 18.32992 10.324808 6.414322 24.70588 0.2792715
## DET 18.67188 11.328125 7.653646 22.73438 0.2608384
## GSW 15.01305 14.067885 6.671018 26.16710 0.2986471
## HOU 10.93846 4.971795 10.241026 32.53077 0.3719982
## IND 16.30691 15.611253 6.391304 20.37852 0.2331734
## LAC 15.87949 12.943590 7.276923 22.12564 0.2562892
## LAL 15.86889 11.084833 6.874036 22.75321 0.2580241
## MEM 19.70077 10.703325 6.562660 22.19437 0.2566848
## MIA 16.58056 10.184143 8.652174 23.48849 0.2746411
## MIL 15.00256 10.322251 7.557545 24.70077 0.2820678
## MIN 17.63874 12.863874 6.602094 22.64136 0.2552909
## NOP 17.25128 11.123077 7.248718 23.01538 0.2568681
## NYK 17.22135 15.575521 6.169271 20.89323 0.2372967
## OKC 15.72821 11.951282 7.346154 23.38205 0.2636311
## ORL 18.41176 14.237852 5.465473 24.76215 0.2816664
## PHI 17.67519 11.621483 6.002558 24.27366 0.2791799
## PHX 17.98721 12.971867 7.378517 21.64962 0.2464266
## POR 15.06888 13.591837 5.885204 26.16582 0.2936949
## SAC 19.47179 12.810256 6.533333 22.42308 0.2558514
## SAS 20.41388 17.521851 5.964010 19.85090 0.2272647
## TOR 16.07179 10.243590 8.566667 24.67949 0.2822084
## UTA 16.08462 8.425641 9.505128 23.76923 0.2820372
## WAS 17.44872 15.269231 6.497436 22.64872 0.2550531
Cluster Analysis
# Center and scale every column of each data set (mean 0, sd 1) so that no
# single variable dominates the distance calculations used for clustering
scale_adv_game <- scale(x = bref_final)
scale_pbp_offense <- scale(x = pbpo_final)
scale_pbp_defense <- scale(x = pbp_defense)
# Elbow curve for the Basketball Reference data: total within-cluster sum of
# squares for k = 1..25 clusters.
# kmeans() starts from randomly chosen centers, so the seed is fixed to make
# the curve reproducible, and nstart = 25 keeps the best of 25 random starts
# so one unlucky initialisation cannot put a spurious bump in the curve.
set.seed(2021)
wss <- numeric(25)
# k = 1: total sum of squares around the column means
wss[1] <- (nrow(scale_adv_game) - 1) * sum(apply(scale_adv_game, 2, var))
for (i in seq_along(wss)[-1]) {
  wss[i] <- kmeans(scale_adv_game, centers = i, nstart = 25)$tot.withinss
}
plot(seq_along(wss), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Elbow curve for the PBP shot and assist data: total within-cluster sum of
# squares for k = 1..21 clusters.
# kmeans() starts from randomly chosen centers, so the seed is fixed to make
# the curve reproducible, and nstart = 25 keeps the best of 25 random starts
# so one unlucky initialisation cannot put a spurious bump in the curve.
set.seed(2021)
wss2 <- numeric(21)
# k = 1: total sum of squares around the column means
wss2[1] <- (nrow(scale_pbp_offense) - 1) *
  sum(apply(scale_pbp_offense, 2, var))
for (i in seq_along(wss2)[-1]) {
  wss2[i] <- kmeans(scale_pbp_offense, centers = i, nstart = 25)$tot.withinss
}
plot(seq_along(wss2), wss2, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Elbow curve for the PBP pace, block, steal, and rebound data: total
# within-cluster sum of squares for k = 1..28 clusters.
# kmeans() starts from randomly chosen centers, so the seed is fixed to make
# the curve reproducible, and nstart = 25 keeps the best of 25 random starts
# so one unlucky initialisation cannot put a spurious bump in the curve.
set.seed(2021)
wss3 <- numeric(28)
# k = 1: total sum of squares around the column means
wss3[1] <- (nrow(scale_pbp_defense) - 1) *
  sum(apply(scale_pbp_defense, 2, var))
for (i in seq_along(wss3)[-1]) {
  wss3[i] <- kmeans(scale_pbp_defense, centers = i, nstart = 25)$tot.withinss
}
plot(seq_along(wss3), wss3, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

K-Means Cluster Analysis 1
# K-means on the standardized Basketball Reference data (6-cluster solution).
# kmeans() starts from randomly chosen centers: fix the seed so cluster labels
# and membership are stable between runs, and keep the best of 25 random
# starts (nstart) rather than trusting a single initialisation.
set.seed(2021)
fit <- kmeans(scale_adv_game, centers = 6, nstart = 25)
# Mean of every standardized variable within each cluster
aggregate(scale_adv_game, by = list(fit$cluster), FUN = mean)
## Group.1 FGA 3PA FTA ORB DRB TRB
## 1 1 0.24820470 0.7219853 -0.04381108 0.16866289 -1.13941025 -0.8144573
## 2 2 0.87047760 -0.5768048 0.57901674 0.81259951 0.98435238 1.1669719
## 3 3 0.64035404 0.4514550 -0.45942432 -1.56376907 -0.07752893 -0.8170627
## 4 4 -0.98460010 -1.1044000 0.72821124 -0.05496363 0.39957219 0.2876394
## 5 5 -1.71958659 -0.4470482 0.24510524 -0.53454822 -1.08805561 -1.1140817
## 6 6 0.01197582 0.2880352 -0.52123347 -0.09915007 0.31508553 0.1998365
## AST STL BLK TOV ORtg DRtg
## 1 0.028383847 0.5868219 0.5105069 0.2091850 -0.2924702 0.76578960
## 2 -0.097208397 0.1299214 -0.6421219 0.2963454 -0.4841173 0.26087338
## 3 1.477718346 1.0033162 1.2004888 0.3716203 -0.1508179 -0.45442459
## 4 -0.843226328 0.5495239 1.1606054 0.3102119 -0.2799714 -1.97618597
## 5 -0.009293826 0.1454622 -0.5623552 0.8450599 -1.0048975 0.15568250
## 6 0.001507107 -0.8392050 -0.3406038 -0.7242239 0.8815767 0.02103818
## Pace
## 1 0.3170937
## 2 0.9734435
## 3 1.3911206
## 4 -0.5864528
## 5 -0.9785579
## 6 -0.5830432
# Attach each row's cluster label to its standardized statistics
clust_adv_game <- data.frame(scale_adv_game, fit.cluster = fit$cluster)
# Show the clusters on the first two principal components.
# labels = 2 labels all points and ellipses; lines = 0 draws no distance lines.
clusplot(scale_adv_game, fit$cluster, lines = 0, labels = 2,
         shade = TRUE, color = TRUE)

K-Means Cluster Analysis 2
# K-means on the standardized PBP shot and assist data (6-cluster solution).
# kmeans() starts from randomly chosen centers: fix the seed so cluster labels
# and membership are stable between runs, and keep the best of 25 random
# starts (nstart) rather than trusting a single initialisation.
set.seed(2021)
fit2 <- kmeans(scale_pbp_offense, centers = 6, nstart = 25)
# Mean of every standardized variable within each cluster
aggregate(scale_pbp_offense, by = list(fit2$cluster), FUN = mean)
## Group.1 OffPoss FG2A FG3A FtPoints PtsUnassisted2s
## 1 1 -0.17933897 -1.3391055 1.16609561 0.15194415 -0.83966646
## 2 2 0.05575049 -0.6301804 0.89397399 -0.33345210 -0.28835466
## 3 3 1.40881000 1.9842318 -1.56834107 1.33896487 1.42752590
## 4 4 -0.02265965 0.1309756 -0.09520147 -0.77592683 0.14658876
## 5 5 -0.29145666 0.4985675 -0.91374351 0.87746029 0.02911422
## 6 6 -0.15516398 0.5995079 -0.62270926 0.08078479 0.32180504
## PtsAssisted3s PtsUnassisted3s ShotQualityAvg.x PtsPutbacks ThreePtAssists
## 1 1.1801724 0.5343864 0.3142766 -0.6758365 1.34846958
## 2 0.5101673 1.2010570 0.1177746 -0.4778973 0.47162693
## 3 -1.5589662 -1.1191667 -1.8002683 -0.4396218 -0.70309481
## 4 0.2173920 -0.2417712 0.2713339 -0.1068590 -1.01413738
## 5 -1.1352865 -0.5443602 0.8176423 0.7906631 -0.18421500
## 6 -0.4707552 -0.6283502 -0.6412333 0.7851951 -0.05501636
## AtRimAssists ShortMidRangeAssists Corner3Assists Avg3ptShotDistance
## 1 0.08293149 -1.04506939 1.6214609 -0.689327291
## 2 -0.06679966 -0.29431150 -0.2944604 1.073089193
## 3 -0.74370880 1.34368145 -0.8135606 0.027615043
## 4 -0.07503682 -0.04477176 -0.2512681 -0.315205765
## 5 0.88577303 -0.21520090 0.2346806 -0.197542323
## 6 -0.25737937 0.91300983 -0.6488778 -0.008419858
## AtRimFGA ShortMidRangeFGA LongMidRangeFGA Corner3FGA Arc3FGA
## 1 0.02709513 -0.88750832 -1.437959671 1.6717476 0.9297467
## 2 -0.45136947 -0.67813734 0.094181558 -0.3386467 0.8786837
## 3 -1.29197575 1.21267714 1.776398512 -0.8557200 -1.0778849
## 4 0.48839948 0.06595976 0.661788443 -0.2590110 -0.8169562
## 5 1.06594707 -0.22821716 -0.388328351 0.2917565 -0.2096058
## 6 -0.42098199 1.08869361 -0.001215618 -0.6615611 -0.2013249
## Arc3Frequency
## 1 1.0279797
## 2 0.8615833
## 3 -1.1231131
## 4 -0.8453387
## 5 -0.2473869
## 6 -0.1927090
# Attach each row's cluster label to its standardized statistics
clust_pbp_offense <- data.frame(scale_pbp_offense, fit2.cluster = fit2$cluster)
# Show the clusters on the first two principal components.
# labels = 2 labels all points and ellipses; lines = 0 draws no distance lines.
clusplot(scale_pbp_offense, fit2$cluster, lines = 0, labels = 2,
         shade = TRUE, color = TRUE)

K-Means Cluster Analysis 3
# K-means on the standardized PBP pace, block, steal, and rebound data
# (6-cluster solution).
# kmeans() starts from randomly chosen centers: fix the seed so cluster labels
# and membership are stable between runs, and keep the best of 25 random
# starts (nstart) rather than trusting a single initialisation.
set.seed(2021)
fit3 <- kmeans(scale_pbp_defense, centers = 6, nstart = 25)
# Mean of every standardized variable within each cluster
aggregate(scale_pbp_defense, by = list(fit3$cluster), FUN = mean)
## Group.1 SecondsPerPossOff SecondsPerPossDef SecondsExcludingORebsPerPossOff
## 1 1 -0.3282081 1.2038265 -0.5061559
## 2 2 -0.3521976 0.3937434 -0.3437141
## 3 3 -1.7754061 1.6716000 -1.6799878
## 4 4 1.4501867 -1.1252650 1.4635925
## 5 5 -0.7986030 -0.5193231 -0.7834978
## 6 6 0.8916054 -0.3534332 0.8895803
## SecondsExcludingORebsPerPossDef Blocks RecoveredBlocks Steals
## 1 1.3345300 -0.3232369 -0.15915016 0.5421403
## 2 0.3727979 -0.3146294 -0.37625289 0.7075888
## 3 1.5723189 1.8125690 1.77233710 0.8985093
## 4 -1.0493668 -1.9404090 -1.91094987 -1.5368699
## 5 -0.5565556 0.1938767 -0.02016582 -0.1178991
## 6 -0.3358013 0.2255929 0.37265440 -0.4760875
## Rebounds DefRebounds OffRebounds SelfOReb
## 1 1.29573764 -0.01135785 1.9368062 2.3240228
## 2 -0.87726601 -0.68419780 -0.3967800 -0.3982174
## 3 0.31972876 1.66671444 -1.7287994 -0.8949506
## 4 -1.25841230 -0.81296165 -0.7918872 -1.1014776
## 5 1.00438361 0.90593149 0.2922360 0.2801172
## 6 0.02777182 -0.16467964 0.2588585 0.0849847
# Attach each row's cluster label to its standardized statistics
clust_pbp_defense <- data.frame(scale_pbp_defense, fit3.cluster = fit3$cluster)
# Show the clusters on the first two principal components.
# labels = 2 labels all points and ellipses; lines = 0 draws no distance lines.
clusplot(scale_pbp_defense, fit3$cluster, lines = 0, labels = 2,
         shade = TRUE, color = TRUE)

Visualizing Clusters
# Store each row's cluster label on the table that was fed to scale()
bref_final[["Cluster"]] <- clust_adv_game[["fit.cluster"]]
# Scatter of standardized TOV against Pace; shape and colour both encode the
# cluster
ggplot(
  data = clust_adv_game,
  mapping = aes(
    x = TOV,
    y = Pace,
    shape = as.factor(fit.cluster),
    colour = as.factor(fit.cluster)
  )
) +
  geom_point(size = 2.5)

# Searchable table of the Basketball Reference stats, grouped by cluster.
# Every listed stat column gets the same definition: the group row shows the
# cluster mean and values are formatted to one decimal place.
reactable(
  data = bref_final,
  searchable = TRUE,
  groupBy = "Cluster",
  columns = lapply(
    setNames(nm = c(
      "FGA", "3PA", "FTA", "ORB", "DRB", "TRB", "AST",
      "STL", "BLK", "TOV", "ORtg", "DRtg", "Pace"
    )),
    function(stat) {
      colDef(aggregate = "mean", format = colFormat(digits = 1))
    }
  )
)
# Store each row's cluster label on the table that was fed to scale()
pbpo_final[["Cluster"]] <- clust_pbp_offense[["fit2.cluster"]]
# Scatter of standardized OffPoss against PtsAssisted3s; shape and colour both
# encode the cluster
ggplot(
  data = clust_pbp_offense,
  mapping = aes(
    x = OffPoss,
    y = PtsAssisted3s,
    shape = as.factor(fit2.cluster),
    colour = as.factor(fit2.cluster)
  )
) +
  geom_point(size = 2.5)

# Searchable table of the PBP shot and assist stats, grouped by cluster; the
# group row of every listed column shows the cluster mean.
# The named vector gives the number of decimals per column. Most columns use
# one decimal, but ShotQualityAvg.x and Arc3Frequency are fractions well below
# 1 whose differences only show up in the second and third decimal, so they
# are shown with three decimals instead of collapsing to the same value.
reactable(
  pbpo_final,
  searchable = TRUE,
  groupBy = "Cluster",
  columns = lapply(
    c(
      OffPoss = 1,
      FG2A = 1,
      FG3A = 1,
      FtPoints = 1,
      PtsUnassisted2s = 1,
      PtsAssisted3s = 1,
      PtsUnassisted3s = 1,
      ShotQualityAvg.x = 3,
      PtsPutbacks = 1,
      ThreePtAssists = 1,
      AtRimAssists = 1,
      ShortMidRangeAssists = 1,
      Corner3Assists = 1,
      Avg3ptShotDistance = 1,
      AtRimFGA = 1,
      ShortMidRangeFGA = 1,
      LongMidRangeFGA = 1,
      Corner3FGA = 1,
      Arc3FGA = 1,
      Arc3Frequency = 3
    ),
    function(n_digits) {
      colDef(aggregate = "mean", format = colFormat(digits = n_digits))
    }
  )
)
# Store each row's cluster label on the table that was fed to scale()
pbp_defense[["Cluster"]] <- clust_pbp_defense[["fit3.cluster"]]
# Scatter of standardized Blocks against SecondsPerPossDef; shape and colour
# both encode the cluster
ggplot(
  data = clust_pbp_defense,
  mapping = aes(
    x = Blocks,
    y = SecondsPerPossDef,
    shape = as.factor(fit3.cluster),
    colour = as.factor(fit3.cluster)
  )
) +
  geom_point(size = 2.5)

# Searchable table of the PBP pace, block, steal, and rebound stats, grouped
# by cluster. Every listed stat column gets the same definition: the group row
# shows the cluster mean and values are formatted to one decimal place.
reactable(
  data = pbp_defense,
  searchable = TRUE,
  groupBy = "Cluster",
  columns = lapply(
    setNames(nm = c(
      "SecondsPerPossOff",
      "SecondsPerPossDef",
      "SecondsExcludingORebsPerPossOff",
      "SecondsExcludingORebsPerPossDef",
      "Blocks",
      "RecoveredBlocks",
      "Steals",
      "Rebounds",
      "DefRebounds",
      "OffRebounds",
      "SelfOReb"
    )),
    function(stat) {
      colDef(aggregate = "mean", format = colFormat(digits = 1))
    }
  )
)